"""

    Cumpare is a python library and program for file comparison.
    Copyright (C) 2010 Antonio Lima <anto87@gmail.com>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.


"""


import sys
import os
import itertools
import hashlib


class AbstractComparer(object):
    """Group files by an attribute computed from each one of them.

    The constructor accepts an iterable of full-path filenames. Concrete
    comparers subclass AbstractComparer and override the _compfunc method,
    which is responsible for generating the attribute of a single file.

    """
    def __init__(self, fnames):
        # Materialize the iterable: callers may hand in a one-shot iterator
        # (e.g. itertools.chain), and compare() must be able to walk the
        # names again on every call.
        self.fnames = list(fnames)
        # Maps each attribute to the list of filenames that produced it.
        # Rebuilt from scratch by every compare() call.
        self.fdict = dict()

    def _compfunc(self, fname):
        """Return the attribute that defines the type of the comparison.

        The method accepts a full-path filename and should output an
        appropriate hashable attribute (it is used as a dictionary key).
        For instance, a hash-based comparison should return the hash digest
        of the file content, while a hypothetical image duplicates finder
        should output an object that describes image properties.

        Raises NotImplementedError unless overridden by a subclass.

        """
        raise NotImplementedError

    def compare(self):
        """Build the attribute -> filenames mapping and return the groups.

        The dictionary key is the attribute generated by _compfunc, while
        the related value is the list of fnames that produced the same
        attribute. The mapping is stored in self.fdict.

        Returns the list of groups (lists of filenames) that contain more
        than one file, as computed by similarGroups().

        """
        # Start from an empty mapping on every call; appending to the
        # previous one would list each file twice and make every file look
        # like a duplicate of itself.
        groups = {}
        compfunc = self._compfunc
        for fname in self.fnames:
            groups.setdefault(compfunc(fname), []).append(fname)
        self.fdict = groups
        return self.similarGroups()

    def similarGroups(self):
        """Return the groups in self.fdict that hold more than one file."""
        # values() rather than itervalues(): available on Python 2 and 3.
        return [group for group in self.fdict.values()
                if len(group) > 1]

class SizeComparer(AbstractComparer):
    """Comparer that groups files by their size."""

    def _compfunc(self, fname):
        """Return the size of the file at *fname* as reported by os.stat."""
        file_stat = os.stat(fname)
        return file_stat.st_size

class HashComparer(AbstractComparer):
    """An abstract comparer based on the hash of the file content.

    Concrete classes should override the _hashObj method.
    """

    # Number of bytes read per iteration while hashing, so that large files
    # are never loaded into memory as a whole.
    _CHUNK_SIZE = 1024 * 1024

    def _hashObj(self):
        """Return a new hash object exposing update() and hexdigest().

        Raises NotImplementedError unless overridden by a subclass.
        """
        raise NotImplementedError

    def _compfunc(self, fname):
        """Return the hexadecimal digest of the content of *fname*.

        Errors raised by open() or read() for a missing or unreadable file
        are not caught here.
        """
        # Binary mode: the digest must be computed over the raw bytes. Text
        # mode may translate line endings on some platforms and yields str
        # objects that hash objects reject on Python 3.
        with open(fname, "rb") as fobj:
            h = self._hashObj()
            # Feeding the hash chunk by chunk gives the same digest as a
            # single update() with the whole content.
            for chunk in iter(lambda: fobj.read(self._CHUNK_SIZE), b""):
                h.update(chunk)
            return h.hexdigest()

class MD5Comparer(HashComparer):
    """Hash-based comparer that uses MD5 digests."""

    def _hashObj(self):
        """Return a fresh MD5 hash object."""
        return hashlib.new("md5")

class SHA1Comparer(HashComparer):
    """Hash-based comparer that uses SHA-1 digests."""

    def _hashObj(self):
        """Return a fresh SHA-1 hash object."""
        return hashlib.new("sha1")

class CumpareJob(object):
    """A duplicate-search job over every file found under a directory.

    The enabled comparisons run as successive stages in the order size,
    MD5, SHA-1; each stage only examines the files that the previous
    enabled stage left in a group of two or more.
    """

    def __init__(self, directory, size=True, sha1=True, md5=False):
        """Store the directory to scan and which comparisons to run.

        directory -- root of the tree walked to collect the files.
        size      -- when true, group files by size.
        sha1      -- when true, group files by SHA-1 digest.
        md5       -- when true, group files by MD5 digest.
        """
        self.directory = directory
        self.size_opt = size
        self.sha1_opt = sha1
        self.md5_opt = md5
        # Result of the last findDupes() run. Starts empty so printDupes()
        # is usable even before findDupes() has been called.
        self.dupesGroups = []

    @property
    def fnames(self):
        """List of the paths of all files found by walking self.directory.

        Each path is the walked directory joined with the file name; the
        tree is walked again on every access.
        """
        return [os.path.join(root, fname)
                for root, dirs, files in os.walk(self.directory)
                for fname in files]

    def execute(self):
        """Run the job: find the duplicates, call action(), print them."""
        self.findDupes()
        self.action()
        self.printDupes()

    @staticmethod
    def _flatten(groups):
        """Return the filenames of all *groups* as one flat list."""
        return list(itertools.chain.from_iterable(groups))

    def findDupes(self):
        """Compute, store in self.dupesGroups and return the dupe groups.

        Returns a list of lists of filenames. The list is empty when no
        comparison is enabled or when no group of two or more files is
        found.
        """
        # The first enabled stage starts from every file, whichever stage
        # that is; later stages only see the survivors of the previous one.
        candidates = self.fnames
        dupesGroups = []

        if self.size_opt:
            dupesGroups = SizeComparer(candidates).compare()
            candidates = self._flatten(dupesGroups)
        if self.md5_opt:
            dupesGroups = MD5Comparer(candidates).compare()
            candidates = self._flatten(dupesGroups)
        if self.sha1_opt:
            dupesGroups = SHA1Comparer(candidates).compare()
        self.dupesGroups = dupesGroups
        return dupesGroups

    def printDupes(self):
        """Write the number of groups and each group to standard output.

        Every group is written on one line, its filenames separated by
        spaces, followed by an empty line.
        """
        # sys.stdout.write instead of the print statement so the module
        # also loads on Python 3. The strings below reproduce what the
        # print statements emitted, including the space after each name.
        write = sys.stdout.write
        write("%d unique file(s) found.\n" % len(self.dupesGroups))

        for dupeGroup in self.dupesGroups:
            write("".join(dupe + " " for dupe in dupeGroup) + "\n\n")

    def action(self):
        """Do nothing; execute() calls this between finding and printing.

        NOTE(review): presumably a hook for subclasses to act on
        self.dupesGroups -- confirm the intended use.
        """
        pass

if __name__ == "__main__":
    # The directory to scan is the only argument; report a usage error
    # instead of failing with an IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s DIRECTORY\n" % sys.argv[0])
        sys.exit(2)
    job = CumpareJob(sys.argv[1])
    job.execute()
